# Profile ./workloads/Sample/sliding_quant.exe with the Nsight Systems CLI.
#   --output           base name of the generated report (dpsk_gen_code)
#   --stats=true       ask nsys to print summary statistics after collection
#   --trace            APIs to trace: CUDA, cuDNN, cuBLAS, NVTX, OS runtime,
#                      OpenSHMEM
#   --cudabacktrace    collect backtraces for kernel launches, syncs and
#                      memory operations; the number after each ':' is a
#                      threshold (nanoseconds per the nsys docs, i.e. 1 ms
#                      here). The nsys docs state this needs CPU sampling on.
# NOTE(review): newer nsys releases document process-tree/system-wide/none as
# the values for --sample; confirm that 'cpu' is still accepted by the
# installed version.
nsys profile \
    --output=dpsk_gen_code \
    --sample=cpu \
    --stats=true \
    --cudabacktrace=kernel:1000000,sync:1000000,memory:1000000 \
    --trace=cuda,cudnn,cublas,nvtx,osrt,oshmem \
    ./workloads/Sample/sliding_quant.exe

# Profile the Python workload workloads/Sample/sliding_quant_torch.py with the
# Nsight Systems CLI; the report is written under the base name torch_code.
#   --stats=true       ask nsys to print summary statistics after collection
#   --trace            APIs to trace: CUDA, cuDNN, cuBLAS, NVTX, OS runtime,
#                      OpenSHMEM
#   --cudabacktrace    collect backtraces for kernel launches, syncs and
#                      memory operations; the number after each ':' is a
#                      threshold (nanoseconds per the nsys docs, i.e. 1 ms
#                      here). The nsys docs state this needs CPU sampling on.
# NOTE(review): newer nsys releases document process-tree/system-wide/none as
# the values for --sample; confirm that 'cpu' is still accepted by the
# installed version.
nsys profile \
    --output=torch_code \
    --sample=cpu \
    --stats=true \
    --cudabacktrace=kernel:1000000,sync:1000000,memory:1000000 \
    --trace=cuda,cudnn,cublas,nvtx,osrt,oshmem \
    python workloads/Sample/sliding_quant_torch.py